#Executive Summary

The anaylysis below is a workforce analysis of Frito Lay performed by DDS Analytics. This analysis covers the following general workforce overview. An analysis of attrition and key job roles at risk for worker attrition. An analysis of classification predicive models to predict attrition as well as an analysis of linear models to predict Monthly income.

Github Repository

https://github.com/Chinchillin1981/CaseStudy2DDS

Libraries

library(ggplot2)
library(plyr)
library(dplyr)
library(formattable)
library(caret)
library(knitr)
library(kableExtra)
library(plotly)
library(corrplot)
library(ggthemes)
library(doParallel)
library(parallel)
library(corrplot)
library(purrr)
library(plotly)

Import Sample data

CSData <- read.csv("CaseStudy2-data.csv", header = TRUE)

** Workforce Analysis**

# Average Age
mean(CSData$Age)
## [1] 36.82874
# Average years at company
mean(CSData$YearsAtCompany)
## [1] 6.962069
# average Job satisfaction
mean(CSData$JobSatisfaction)
## [1] 2.709195
# Average years with manager
mean(CSData$YearsWithCurrManager)
## [1] 4.14023
# Average years between promotion
mean(CSData$YearsSinceLastPromotion)
## [1] 2.168966
#Distance from home
mean(CSData$DistanceFromHome)
## [1] 9.33908
#Environment Satisfaction
mean(CSData$EnvironmentSatisfaction)
## [1] 2.701149
#Relationship satisfaction
mean(CSData$RelationshipSatisfaction)
## [1] 2.706897
#Performance Rating
mean(CSData$PerformanceRating)
## [1] 3.151724
#Total Working Years
mean(CSData$TotalWorkingYears)
## [1] 11.05287
# Pie chart of percentage of male and female employees almost 60% of workforce is male
library(plotly)
plot_ly(data = CSData, labels = CSData$Gender, values = plyr::count(CSData$Gender), type = "pie", title = "Gender")
# Pie Chart of rob roles and percentage of workforce
library(plotly)
plot_ly(data = CSData, labels = CSData$JobRole, values = plyr::count(CSData$JobRole), type = "pie", title = "Job Roles")
# Over view of Eduction
mean(CSData$Education)
## [1] 2.901149
plot_ly(data = CSData, labels = CSData$EducationField, values = plyr::count(CSData$EducationField), type = "pie", title = "Education Field")

## Analysis job roles and turnover

#Get Percentage of Roles and turnover
RoleTotal <- plyr::count(CSData$JobRole)
#Format columns
names(RoleTotal)[1] <- "JobRole"
names(RoleTotal)[2] <- "Total"
RoleTotal$JobRole <- as.character(RoleTotal$JobRole)
#Only employees with attrition
AttrY <- dplyr::filter(CSData, CSData$Attrition == "Yes")
#Count of number of employees in role who quit
AttrRole <-  plyr::count(AttrY$JobRole)
#Format Columns Attr Role
names(AttrRole)[1] <- "JobRole"
names(AttrRole)[2] <- "Attr"
AttrRole$JobRole <- as.character(AttrRole$JobRole)
#Merge RoleTotal with AttrRole
Role_Attr_Total <- merge(RoleTotal,AttrRole)
# Add a column that is the calculated percentage of Job Role turnover 
Role_Attr_Total <- mutate(Role_Attr_Total, Attr_Percent = Attr / Total)
#Format the Attr_Percent column to Percentage
Role_Attr_Total$Attr_Percent <- percent(Role_Attr_Total$Attr_Percent)
#Arrange the data in Descending Order by Attribution Percentage
Role_Attr_Total <- Role_Attr_Total %>% arrange(desc(Attr_Percent))

kable(Role_Attr_Total) %>% kable_styling() %>% column_spec(4, bold = TRUE)
JobRole Total Attr Attr_Percent
Sales Representative 53 24 45.28%
Human Resources 27 6 22.22%
Laboratory Technician 153 30 19.61%
Research Scientist 172 32 18.60%
Sales Executive 200 33 16.50%
Healthcare Representative 76 8 10.53%
Manager 51 4 7.84%
Manufacturing Director 87 2 2.30%
Research Director 51 1 1.96%
#Plot of Attrition in each Job role
p <- ggplot(data = CSData, aes(JobRole, fill = (Attrition == "Yes")))
p + geom_bar() + coord_flip() + labs(title = "Attrition in each Job Role", x = "Job Role", y = "Number of Employees") + 
  scale_fill_hc(name = "Attrition", labels = c("Total Employees", "Attrition") ) + theme_linedraw()

Drill down into top three highest job roles with attrition

Sales Representative Attrition analysis

# Sales Rep Attrition
  SalesReps <- filter(CSData, JobRole == "Sales Representative")
  AttrYSales <- filter(AttrY, JobRole == "Sales Representative")

# Average years at the company. Sales Reps that leave on average are at the company less than 2.5 years.
mean(SalesReps$YearsAtCompany)
## [1] 2.924528
mean(AttrYSales$YearsAtCompany)
## [1] 2.375
# Most sales reps that stay with the company are 30 and over
plot(SalesReps$Attrition, SalesReps$Age, xlab = "Attrition", ylab = "Age", main = "Sales Rep Attrition and Age")

# Job Satisfaction Ratings for Sales Reps with Attrition is never above a 3.0 and averages 2.5
plot(SalesReps$Attrition, SalesReps$JobSatisfaction, xlab = "Attrition", ylab = "Job Satisfaction", main = "Sales Rep Attrition and Job Satisfaction")

# Turnover and Distance from home in general sales reps that leave the company live farther away then the 5mile average that sales reps who stay live
plot(SalesReps$Attrition, SalesReps$DistanceFromHome, xlab = "Attrition", ylab = "Distance from home", main = "Sales Rep Attrition and Distance from Home")

#The reps that leave are disproportionatly Single
plot(SalesReps$Attrition, SalesReps$MaritalStatus, xlab = "Attrition", ylab = "Marriage Status", main = "Sales Rep Attrition and Marital Status")

Human Resources Attrition analysis

#Filter on Human Resources
  HR <- filter(CSData, JobRole == "Human Resources")

# HR Rep turnover average age is less thatn 30
plot(HR$Attrition, HR$Age, xlab = "Attrition", ylab = "Age", main = "Human Resources Attrition and Age")

# HR Reps that left company lived much farther from work on average 20 miles away
plot(HR$Attrition, HR$DistanceFromHome, xlab = "Attrition", ylab = "Distance From Home", main = "Human Resources Attrition and Distance from home")

#Job Satisfaction of those that left averaged 2
plot(HR$Attrition, HR$JobSatisfaction, xlab = "Attrition", ylab = "Job Satisfaction", main = "Human Resources Attrition and Job Satisfaction")

# Total working years of those that stayed averaged 7 years those that quite averaged 2
plot(HR$Attrition, HR$TotalWorkingYears, xlab = "attrition", ylab = "Job Satisfaction", main = "Human Resources Atrition and Total Working Years")

# Those that stay have been with the company an average of five years those that leave Less than two
plot(HR$Attrition, HR$YearsAtCompany, xlab = "Attrition", ylab = "Years At Company", main = "Human Rsources Attrition and Years at the Company")

Laboratory Technician Attrition analysis

 # Lab Techs
  LabTech <- filter(CSData, CSData$JobRole == "Laboratory Technician")

#Lab Techs that quit had a much lower Environment Satisfaction than the averge of 3
plot(LabTech$Attrition, LabTech$EnvironmentSatisfaction, xlab = "Attrition", ylab = "Environment Satisfaction", main = "Lab Tech Attrition and Environment Satisfaction")

#Lab Tech attrition and Age
plot(LabTech$Attrition, LabTech$Age, xlab = "Attrition", ylab = "Age", main = "Lab Tech Attrition and Age")

#Lab tech and distance from home
plot(LabTech$Attrition, LabTech$DistanceFromHome, xlab = "Attrition", ylab = "Distance From Home", main = "Lab Tech Attrition and Distance from home")

#Lab tech and total working years
plot(LabTech$Attrition, LabTech$TotalWorkingYears, xlab = "Attrition", ylab = "Total Working Years", main = "Lab Tech Attrition and Total working years")

# Single lab techs are significantly more likely to leave
plot(LabTech$Attrition, LabTech$MaritalStatus, xlab = "Attrition",ylab = "Marital Status",main = "Lab Tech Attrition and Marital Status" )

Predictive model to identify employees that are likely to leave the company

Find coorolation to Attrition

#Look at the proportion of each variable that influences attrition

#Step 1 remove data that is not going to be useful for finding attrition ID, Employee Number, Standard Hours, and Over18
CSData_AttrUseful <- CSData %>% select(- c(ID,EmployeeNumber, StandardHours, Over18, EmployeeCount))

#Create a function that will create a plot for each variable
AttrPlot <- function(df, x, y){
  ggplot(data = df, aes_string(x = x, fill = y)) +
    geom_bar(alpha = .9, position = "fill") +
    coord_flip() + labs(x = x, y = "Attrition") + theme_hc()+ scale_fill_hc()
}

yname <- "Attrition"
xname <- names(CSData_AttrUseful[-ncol(CSData_AttrUseful)])

lapply(xname, function(x) AttrPlot(df = CSData_AttrUseful, x = x, y = yname))
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

## 
## [[11]]

## 
## [[12]]

## 
## [[13]]

## 
## [[14]]

## 
## [[15]]

## 
## [[16]]

## 
## [[17]]

## 
## [[18]]

## 
## [[19]]

## 
## [[20]]

## 
## [[21]]

## 
## [[22]]

## 
## [[23]]

## 
## [[24]]

## 
## [[25]]

## 
## [[26]]

## 
## [[27]]

## 
## [[28]]

## 
## [[29]]

## 
## [[30]]

Test Classifier models naive bayes and knn for best fit

#Remove columns that are not useful
CSData_AttrUseful <- CSData %>% select(- c(ID,EmployeeNumber, StandardHours, Over18, EmployeeCount))

#Create training and test data
set.seed(8)
TrainObs <- createDataPartition(y = CSData_AttrUseful$Attrition, p = .60, list = FALSE)
#Create the training observations for Attrition
AttrTrain <- CSData_AttrUseful[TrainObs,]

#Create the test Observations for Attrition
AttrTest <- CSData_AttrUseful[-TrainObs,]
#Set the training control method
trainMethod <- trainControl(method = "repeatedcv", number =  25, repeats = 5, summaryFunction = twoClassSummary, classProbs = TRUE)

#Check number of cores for parallel processing
parallel::detectCores() #4 cores detected on iMac used for study
## [1] 4
#Assign cores to run this training model
workers <- makeCluster(3L)

#Sets up workers to run training
registerDoParallel(workers)

Naive Bayes method

#Fit the Naives Bayes model
fit.nb <- train(Attrition ~., data = AttrTrain, method = "nb", metric = "Spec", trControl = trainMethod, preProcess = c("center","scale"), tuneLength = 31)

Predict,Summary, and assessment of Naive Bayes model

#Predictions based on Naives Bayes method
pred.nb <- predict(fit.nb, AttrTest)

#Summary of Naives Bayes predicions
summary(pred.nb)
##  No Yes 
## 214 134
#Confusion Matrix to assess model
confusionMatrix(pred.nb, AttrTest$Attrition)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  195  19
##        Yes  97  37
##                                          
##                Accuracy : 0.6667         
##                  95% CI : (0.6144, 0.716)
##     No Information Rate : 0.8391         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.2102         
##                                          
##  Mcnemar's Test P-Value : 8.724e-13      
##                                          
##             Sensitivity : 0.6678         
##             Specificity : 0.6607         
##          Pos Pred Value : 0.9112         
##          Neg Pred Value : 0.2761         
##              Prevalence : 0.8391         
##          Detection Rate : 0.5603         
##    Detection Prevalence : 0.6149         
##       Balanced Accuracy : 0.6643         
##                                          
##        'Positive' Class : No             
## 

KNN method

fit.knn <- train(Attrition ~., data = AttrTrain, method = "knn", metric = "Spec", trControl = trainMethod, preProcess = c("center","scale"), tuneLength = 31)

Predict,Summary, and assessment of KNN model

#Predictions based on Naives Bayes method
pred.knn <- predict(fit.knn, AttrTest)

#Summary of Naives Bayes predicions
summary(pred.knn)
##  No Yes 
## 333  15
#Confusion Matrix to assess model
confusionMatrix(pred.knn, AttrTest$Attrition)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  287  46
##        Yes   5  10
##                                           
##                Accuracy : 0.8534          
##                  95% CI : (0.8119, 0.8889)
##     No Information Rate : 0.8391          
##     P-Value [Acc > NIR] : 0.2588          
##                                           
##                   Kappa : 0.2293          
##                                           
##  Mcnemar's Test P-Value : 2.13e-08        
##                                           
##             Sensitivity : 0.9829          
##             Specificity : 0.1786          
##          Pos Pred Value : 0.8619          
##          Neg Pred Value : 0.6667          
##              Prevalence : 0.8391          
##          Detection Rate : 0.8247          
##    Detection Prevalence : 0.9569          
##       Balanced Accuracy : 0.5807          
##                                           
##        'Positive' Class : No              
## 

Classification Model fit conclusion

Based on the output of both models the KNN model has high Accuracy and Sensitivity but it’s specificity is only 18%. The Naive Bayes models meets all the criteria required for Accuracy, Sensitiy, and Specificity all being over 60%.

Predicting Monthly Income

The most significant variables that corelate to Monthly income are Job Level (95%) and Total Working years (78%)

#function to create corrolation heatmap
correlator <- function(df){
  df %>%
    keep(is.numeric) %>%
    tidyr::drop_na() %>%
    cor %>%
    corrplot(addCoef.col = "white", number.digits = 2,
             number.cex = .5, method = "square",
             order = "hclust",
             tl.srt = 45, tl.cex = .8)
}

correlator(CSData_AttrUseful)

Comparing Linear regression models to predict Monthly Income using a simple linear model and knn regression

# Create the training and test data for the Monthly Income models
set.seed(12)
TrainObs <- createDataPartition(y = CSData_AttrUseful$Attrition, p = .60, list = FALSE)

#Create the training observations for Monthly Income
MITrain <- CSData_AttrUseful[TrainObs,]

#Create the test Observations for Monthly Income
MITest <- CSData_AttrUseful[-TrainObs,]
# Set the training method for the regression models
trainMethod2 <- trainControl(method = "repeatedcv", number =  25, repeats = 5)

Fit a simple linear regression model

# Fit lm model
fit.lm <- train(MonthlyIncome ~., data = MITrain, method = "lm", trControl = trainMethod2)

# Check RMSE of linear model
fit.lm
## Linear Regression 
## 
## 522 samples
##  30 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (25 fold, repeated 5 times) 
## Summary of sample sizes: 501, 501, 501, 500, 502, 501, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   1088.389  0.9385737  853.8985
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

Fit a knn regression model

# Fit knn regression model
fit.knnreg <- train(MonthlyIncome ~., data = MITrain, method = "knn", trControl = trainMethod2)

# Check RMSE of knn regression model
fit.knnreg
## k-Nearest Neighbors 
## 
## 522 samples
##  30 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (25 fold, repeated 5 times) 
## Summary of sample sizes: 501, 499, 501, 502, 500, 501, ... 
## Resampling results across tuning parameters:
## 
##   k  RMSE      Rsquared    MAE     
##   5  4924.749  0.04703560  3749.325
##   7  4756.036  0.04941159  3620.643
##   9  4691.593  0.04871913  3606.306
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 9.

Regression Model fit Conclucsion

The simple linear model had a lower RMSE than the knn model and a much higher Rsquared. The linear regression model is a better fit than the knn